AMDGPU/GlobalISel: AMDGPURegBankLegalize #112864
Conversation
This stack of pull requests is managed by Graphite.
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel

Author: Petar Avramovic (petar-avramovic)

Changes

Lower G_ instructions that can't be inst-selected with the register bank assignment from AMDGPURegBankSelect, based on uniformity analysis. Given LLTs on all operands after the legalizer, some register bank assignments require lowering while others do not. The AMDGPURegBankLegalize goals and the regression tests enabled by -new-reg-bank-select are summarized in the full description below.

Patch is 140.53 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/112864.diff

16 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index 6f6ad5cf82cae1..244d58c2fd0810 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -107,3 +107,183 @@ void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
S32S64LaneMask.insert(LCSSAPhi.getOperand(0).getReg());
}
}
+
+MachineInstrBuilder AMDGPU::buildReadAnyLaneB32(MachineIRBuilder &B,
+ const DstOp &SgprDst,
+ const SrcOp &VgprSrc,
+ const RegisterBankInfo &RBI) {
+ auto RFL = B.buildInstr(AMDGPU::G_READANYLANE, {SgprDst}, {VgprSrc});
+ Register Dst = RFL->getOperand(0).getReg();
+ Register Src = RFL->getOperand(1).getReg();
+ MachineRegisterInfo &MRI = *B.getMRI();
+ if (!MRI.getRegBankOrNull(Dst))
+ MRI.setRegBank(Dst, RBI.getRegBank(SGPRRegBankID));
+ if (!MRI.getRegBankOrNull(Src))
+ MRI.setRegBank(Src, RBI.getRegBank(VGPRRegBankID));
+ return RFL;
+}
+
+MachineInstrBuilder
+AMDGPU::buildReadAnyLaneSequenceOfB32(MachineIRBuilder &B, const DstOp &SgprDst,
+ const SrcOp &VgprSrc, LLT B32Ty,
+ const RegisterBankInfo &RBI) {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ SmallVector<Register, 8> SgprDstParts;
+ auto Unmerge = B.buildUnmerge(B32Ty, VgprSrc);
+ for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) {
+ SgprDstParts.push_back(
+ buildReadAnyLaneB32(B, B32Ty, Unmerge.getReg(i), RBI).getReg(0));
+ }
+
+ auto Merge = B.buildMergeLikeInstr(SgprDst, SgprDstParts);
+ MRI.setRegBank(Merge.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID));
+ return Merge;
+}
+
+MachineInstrBuilder
+AMDGPU::buildReadAnyLaneSequenceOfS64(MachineIRBuilder &B, const DstOp &SgprDst,
+ const SrcOp &VgprSrc,
+ const RegisterBankInfo &RBI) {
+ LLT S32 = LLT::scalar(32);
+ LLT S64 = LLT::scalar(64);
+ MachineRegisterInfo &MRI = *B.getMRI();
+ SmallVector<Register, 8> SgprDstParts;
+ auto Unmerge = B.buildUnmerge(S64, VgprSrc);
+
+ for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) {
+ MRI.setRegBank(Unmerge.getReg(i), RBI.getRegBank(AMDGPU::VGPRRegBankID));
+ auto Unmerge64 = B.buildUnmerge(S32, Unmerge.getReg(i));
+ SmallVector<Register, 2> Unmerge64Parts;
+ Unmerge64Parts.push_back(
+ buildReadAnyLaneB32(B, S32, Unmerge64.getReg(0), RBI).getReg(0));
+ Unmerge64Parts.push_back(
+ buildReadAnyLaneB32(B, S32, Unmerge64.getReg(1), RBI).getReg(0));
+ Register MergeReg = B.buildMergeLikeInstr(S64, Unmerge64Parts).getReg(0);
+ MRI.setRegBank(MergeReg, RBI.getRegBank(AMDGPU::SGPRRegBankID));
+ SgprDstParts.push_back(MergeReg);
+ }
+
+ auto Merge = B.buildMergeLikeInstr(SgprDst, SgprDstParts);
+ MRI.setRegBank(Merge.getReg(0), RBI.getRegBank(AMDGPU::SGPRRegBankID));
+ return Merge;
+}
+
+MachineInstrBuilder AMDGPU::buildReadAnyLane(MachineIRBuilder &B,
+ const DstOp &SgprDst,
+ const SrcOp &VgprSrc,
+ const RegisterBankInfo &RBI) {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ LLT S16 = LLT::scalar(16);
+ LLT S32 = LLT::scalar(32);
+ LLT S64 = LLT::scalar(64);
+ LLT S256 = LLT::scalar(256);
+ LLT V2S16 = LLT::fixed_vector(2, 16);
+ LLT Ty = SgprDst.getLLTTy(MRI);
+
+ if (Ty == S16) {
+ return B.buildTrunc(
+ SgprDst, buildReadAnyLaneB32(B, S32, B.buildAnyExt(S32, VgprSrc), RBI));
+ }
+
+ if (Ty == S32 || Ty == V2S16 ||
+ (Ty.isPointer() && Ty.getSizeInBits() == 32)) {
+ return buildReadAnyLaneB32(B, SgprDst, VgprSrc, RBI);
+ }
+
+ if (Ty == S64 || Ty == S256 || (Ty.isPointer() && Ty.getSizeInBits() == 64) ||
+ (Ty.isVector() && Ty.getElementType() == S32)) {
+ return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, S32, RBI);
+ }
+
+ if (Ty.isVector() && Ty.getElementType() == S16) {
+ return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, V2S16, RBI);
+ }
+
+ if (Ty.isVector() && Ty.getElementType() == S64) {
+ return buildReadAnyLaneSequenceOfS64(B, SgprDst, VgprSrc, RBI);
+ }
+
+ llvm_unreachable("Type not supported");
+}
+
+void AMDGPU::buildReadAnyLaneDst(MachineIRBuilder &B, MachineInstr &MI,
+ const RegisterBankInfo &RBI) {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ Register Dst = MI.getOperand(0).getReg();
+ const RegisterBank *DstBank = MRI.getRegBankOrNull(Dst);
+ if (DstBank != &RBI.getRegBank(AMDGPU::SGPRRegBankID))
+ return;
+
+ Register VgprDst = MRI.createGenericVirtualRegister(MRI.getType(Dst));
+ MRI.setRegBank(VgprDst, RBI.getRegBank(AMDGPU::VGPRRegBankID));
+
+ MI.getOperand(0).setReg(VgprDst);
+ MachineBasicBlock *MBB = MI.getParent();
+ B.setInsertPt(*MBB, std::next(MI.getIterator()));
+ // readAnyLane VgprDst into Dst after MI.
+ buildReadAnyLane(B, Dst, VgprDst, RBI);
+ return;
+}
+
+bool AMDGPU::isLaneMask(Register Reg, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *TRI) {
+ const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
+ if (RB && RB->getID() == VCCRegBankID)
+ return true;
+
+ const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
+ if (RC && TRI->isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1))
+ return true;
+
+ return false;
+}
+
+bool AMDGPU::isSgprRB(Register Reg, MachineRegisterInfo &MRI) {
+ const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
+ if (RB && RB->getID() == SGPRRegBankID)
+ return true;
+
+ return false;
+}
+
+bool AMDGPU::isVgprRB(Register Reg, MachineRegisterInfo &MRI) {
+ const RegisterBank *RB = MRI.getRegBankOrNull(Reg);
+ if (RB && RB->getID() == VGPRRegBankID)
+ return true;
+
+ return false;
+}
+
+void AMDGPU::cleanUpAfterCombine(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineInstr *Optional0) {
+ MI.eraseFromParent();
+ if (Optional0 && isTriviallyDead(*Optional0, MRI))
+ Optional0->eraseFromParent();
+}
+
+bool AMDGPU::hasSGPRS1(MachineFunction &MF, MachineRegisterInfo &MRI) {
+ for (auto &MBB : MF) {
+ for (auto &MI : make_early_inc_range(MBB)) {
+ for (MachineOperand &Op : MI.operands()) {
+ if (!Op.isReg())
+ continue;
+
+ Register Reg = Op.getReg();
+ if (!Reg.isVirtual())
+ continue;
+
+ if (!isSgprRB(Reg, MRI) || MRI.getType(Reg) != LLT::scalar(1))
+ continue;
+
+ MI.getParent()->dump();
+ MI.dump();
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+bool AMDGPU::isS1(Register Reg, MachineRegisterInfo &MRI) {
+ return MRI.getType(Reg) == LLT::scalar(1);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index 4d504d0204d81a..bf812dd86fbd04 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -9,7 +9,11 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
+#include "AMDGPURegisterBankInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/Register.h"
#include <utility>
@@ -48,7 +52,58 @@ class IntrinsicLaneMaskAnalyzer {
// This will not be needed when we turn of LCSSA for global-isel.
void findLCSSAPhi(Register Reg);
};
+
+void buildReadAnyLaneS1(MachineIRBuilder &B, MachineInstr &MI,
+ const RegisterBankInfo &RBI);
+
+MachineInstrBuilder buildReadAnyLaneB32(MachineIRBuilder &B,
+ const DstOp &SgprDst,
+ const SrcOp &VgprSrc,
+ const RegisterBankInfo &RBI);
+
+MachineInstrBuilder buildReadAnyLaneSequenceOfB32(MachineIRBuilder &B,
+ const DstOp &SgprDst,
+ const SrcOp &VgprSrc,
+ LLT B32Ty,
+ const RegisterBankInfo &RBI);
+
+MachineInstrBuilder buildReadAnyLaneSequenceOfS64(MachineIRBuilder &B,
+ const DstOp &SgprDst,
+ const SrcOp &VgprSrc,
+ const RegisterBankInfo &RBI);
+
+MachineInstrBuilder buildReadAnyLane(MachineIRBuilder &B, const DstOp &SgprDst,
+ const SrcOp &VgprSrc,
+ const RegisterBankInfo &RBI);
+
+// Create new vgpr destination register for MI then move it to current
+// MI's sgpr destination using one or more G_READANYLANE instructions.
+void buildReadAnyLaneDst(MachineIRBuilder &B, MachineInstr &MI,
+ const RegisterBankInfo &RBI);
+
+// Share with SIRegisterInfo::isUniformReg? This could make uniformity info give
+// same result in later passes.
+bool isLaneMask(Register Reg, MachineRegisterInfo &MRI,
+ const SIRegisterInfo *TRI);
+
+bool isSgprRB(Register Reg, MachineRegisterInfo &MRI);
+
+bool isVgprRB(Register Reg, MachineRegisterInfo &MRI);
+
+template <typename SrcTy>
+inline MIPatternMatch::UnaryOp_match<SrcTy, AMDGPU::G_READANYLANE>
+m_GReadAnyLane(const SrcTy &Src) {
+ return MIPatternMatch::UnaryOp_match<SrcTy, AMDGPU::G_READANYLANE>(Src);
}
-}
+
+void cleanUpAfterCombine(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineInstr *Optional0 = nullptr);
+
+bool hasSGPRS1(MachineFunction &MF, MachineRegisterInfo &MRI);
+
+bool isS1(Register Reg, MachineRegisterInfo &MRI);
+
+} // namespace AMDGPU
+} // namespace llvm
#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 800bdbe04cf70d..3e1a78050c8a2f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -217,6 +217,75 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
return true;
}
+bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineBasicBlock *BB = I.getParent();
+
+ unsigned CmpOpc =
+ STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
+ MachineInstr *Cmp = BuildMI(*BB, &I, DL, TII.get(CmpOpc))
+ .addReg(I.getOperand(1).getReg())
+ .addImm(0);
+ if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
+ return false;
+
+ Register DstReg = I.getOperand(0).getReg();
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
+
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(DstReg, AMDGPU::SGPR_32RegClass, *MRI);
+}
+
+bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineBasicBlock *BB = I.getParent();
+
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
+ std::optional<ValueAndVReg> Arg =
+ getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
+
+ if (Arg) {
+ const int64_t Value = Arg->Value.getZExtValue();
+ if (Value == 0) {
+ unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
+ BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
+ } else {
+ assert(Value == 1);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
+ .addReg(TRI.getExec());
+ }
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
+ }
+
+ // RBLegalize ensures that SrcReg is bool in reg (high bits are 0).
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
+
+ unsigned SelectOpcode =
+ STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
+ MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
+ .addReg(TRI.getExec())
+ .addImm(0);
+
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
+}
+
+bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
+
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineBasicBlock *BB = I.getParent();
+
+ auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+ .addReg(SrcReg);
+
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
+}
+
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
const Register DefReg = I.getOperand(0).getReg();
const LLT DefTy = MRI->getType(DefReg);
@@ -249,7 +318,21 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
}
}
- // TODO: Verify that all registers have the same bank
+ // If inputs have register bank, assign corresponding reg class.
+ // Note: registers don't need to have the same reg bank.
+ for (unsigned i = 1; i < I.getNumOperands(); i += 2) {
+ const Register SrcReg = I.getOperand(i).getReg();
+
+ const RegisterBank *RB = MRI->getRegBankOrNull(SrcReg);
+ if (RB) {
+ const LLT SrcTy = MRI->getType(SrcReg);
+ const TargetRegisterClass *SrcRC =
+ TRI.getRegClassForTypeOnBank(SrcTy, *RB);
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
+ return false;
+ }
+ }
+
I.setDesc(TII.get(TargetOpcode::PHI));
return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}
@@ -3656,6 +3739,12 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return selectStackRestore(I);
case AMDGPU::G_PHI:
return selectPHI(I);
+ case AMDGPU::G_COPY_SCC_VCC:
+ return selectCOPY_SCC_VCC(I);
+ case AMDGPU::G_COPY_VCC_SCC:
+ return selectCOPY_VCC_SCC(I);
+ case AMDGPU::G_READANYLANE:
+ return selectReadAnyLane(I);
case TargetOpcode::G_CONSTANT:
case TargetOpcode::G_FCONSTANT:
default:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index df39ecbd61bce6..11bba12499f0ce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -87,6 +87,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
bool constrainCopyLikeIntrin(MachineInstr &MI, unsigned NewOpc) const;
bool selectCOPY(MachineInstr &I) const;
+ bool selectCOPY_SCC_VCC(MachineInstr &I) const;
+ bool selectCOPY_VCC_SCC(MachineInstr &I) const;
+ bool selectReadAnyLane(MachineInstr &I) const;
bool selectPHI(MachineInstr &I) const;
bool selectG_TRUNC(MachineInstr &I) const;
bool selectG_SZA_EXT(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURBLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURBLegalize.cpp
index 9a9722559377f6..7c348bf759cadc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURBLegalize.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURBLegalize.cpp
@@ -18,7 +18,13 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUGlobalISelUtils.h"
+#include "AMDGPURBLegalizeHelper.h"
+#include "GCNSubtarget.h"
+#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
+#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/InitializePasses.h"
#define DEBUG_TYPE "rb-legalize"
@@ -41,6 +47,9 @@ class AMDGPURBLegalize : public MachineFunctionPass {
StringRef getPassName() const override { return "AMDGPU RB Legalize"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ AU.addRequired<GISelCSEAnalysisWrapperPass>();
+ AU.addRequired<MachineUniformityAnalysisPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -56,6 +65,9 @@ class AMDGPURBLegalize : public MachineFunctionPass {
INITIALIZE_PASS_BEGIN(AMDGPURBLegalize, DEBUG_TYPE, "AMDGPU RB Legalize", false,
false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
INITIALIZE_PASS_END(AMDGPURBLegalize, DEBUG_TYPE, "AMDGPU RB Legalize", false,
false)
@@ -69,6 +81,241 @@ FunctionPass *llvm::createAMDGPURBLegalizePass() {
using namespace AMDGPU;
+const RegBankLegalizeRules &getRules(const GCNSubtarget &ST,
+ MachineRegisterInfo &MRI) {
+ static std::mutex GlobalMutex;
+ static SmallDenseMap<unsigned, std::unique_ptr<RegBankLegalizeRules>>
+ CacheForRuleSet;
+ std::lock_guard<std::mutex> Lock(GlobalMutex);
+ if (!CacheForRuleSet.contains(ST.getGeneration())) {
+ auto Rules = std::make_unique<RegBankLegalizeRules>(ST, MRI);
+ CacheForRuleSet[ST.getGeneration()] = std::move(Rules);
+ } else {
+ CacheForRuleSet[ST.getGeneration()]->refreshRefs(ST, MRI);
+ }
+ return *CacheForRuleSet[ST.getGeneration()];
+}
+
bool AMDGPURBLegalize::runOnMachineFunction(MachineFunction &MF) {
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Setup the instruction builder with CSE.
+ std::unique_ptr<MachineIRBuilder> MIRBuilder;
+ const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+ GISelCSEAnalysisWrapper &Wrapper =
+ getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
+ GISelCSEInfo *CSEInfo = nullptr;
+ GISelObserverWrapper Observer;
+
+ if (TPC.isGISelCSEEnabled()) {
+ MIRBuilder = std::make_unique<CSEMIRBuilder>();
+ CSEInfo = &Wrapper.get(TPC.getCSEConfig());
+ MIRBuilder->setCSEInfo(CSEInfo);
+ Observer.addObserver(CSEInfo);
+ MIRBuilder->setChangeObserver(Observer);
+ } else {
+ MIRBuilder = std::make_unique<MachineIRBuilder>();
+ }
+ MIRBuilder->setMF(MF);
+
+ RAIIDelegateInstaller DelegateInstaller(MF, &Observer);
+ RAIIMFObserverInstaller MFObserverInstaller(MF, Observer);
+
+ const MachineUniformityInfo &MUI =
+ getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
+ const RegisterBankInfo &RBI = *MF.getSubtarget().getRegBankInfo();
+
+ // RegBankLegalizeRules is initialized with assigning sets of IDs to opcodes.
+ const RegBankLegalizeRules &RBLRules = getRules(ST, MRI);
+
+ // Logic that does legalization based on IDs assigned to Opcode.
+ RegBankLegalizeHelper RBLegalizeHelper(*MIRBuilder, MRI, MUI, RBI, RBLRules);
+
+ SmallVector<MachineInstr *> AllInst;
+
+ for (auto &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ AllInst.push_back(&MI);
+ }
+ }
+
+ for (auto &MI : AllInst) {
+ if (!MI->isPreISelOpcode())
+ continue;
+
+ unsigned Opc = MI->getOpcode();
+
+ // Insert point for use operands needs some calculation.
+ if (Opc == G_PHI) {
+ RBLegalizeHelper.applyMappingPHI(*MI);
+ continue;
+ }
+
+ // Opcodes that support pretty much all combinations of reg banks and LLTs
+ // (except S1). There is no point in writing rules for them.
+ if (Opc == G_BUILD_VECTOR || Opc == G_UNMERGE_VALUES ||
+ Opc == G_MERGE_VALUES) {
+ RBLegalizeHelper.applyMappingTrivial(*MI);
+ continue;
+ }
+
+ // Opcodes that also support S1. S1 rules are in RegBankLegalizeRules.
+ // Remaining reg bank and LLT combinations are trivially accepted.
+ if ((Opc == G_CONSTANT || Opc == G_FCONSTANT || Opc == G_IMPLICIT_DEF) &&
+ !isS1(MI->getOperand(0).getReg(), MRI)) {
+ assert(isSgprRB(MI->getOperand(0).getReg(), MRI));
+ continue;
+ }
+
+ if (!RBLegalizeHelper.findRuleAndApplyMapping(*MI)) {
+ MI->dump();
+ llvm_unreachable("failed to match any of the rules");
+ }
+ }
+
+ LLT S1 = LLT::scalar(1);
+ LLT S16 = LLT::scalar(16...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
auto Unmerge = B.buildUnmerge(S64, VgprSrc);

for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) {
  MRI.setRegBank(Unmerge.getReg(i), RBI.getRegBank(AMDGPU::VGPRRegBankID));
Use the direct VGPRRegBank pointer or pull this out of the loop
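A minimal sketch of the "pull it out of the loop" option, assuming the surrounding code stays as quoted above (the hoisted variable name is illustrative):

const RegisterBank &VgprRB = RBI.getRegBank(AMDGPU::VGPRRegBankID); // look up once
for (unsigned i = 0; i < Unmerge->getNumOperands() - 1; ++i) {
  MRI.setRegBank(Unmerge.getReg(i), VgprRB);
  // ... rest of the loop body unchanged ...
}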
const DstOp &SgprDst,
const SrcOp &VgprSrc,
SrcOp / DstOp are for MachineIRBuilder, and other code probably shouldn't be using them
if (Ty == S64 || Ty == S256 || (Ty.isPointer() && Ty.getSizeInBits() == 64) ||
    (Ty.isVector() && Ty.getElementType() == S32)) {
  return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, S32, RBI);
}

if (Ty.isVector() && Ty.getElementType() == S16) {
  return buildReadAnyLaneSequenceOfB32(B, SgprDst, VgprSrc, V2S16, RBI);
}

if (Ty.isVector() && Ty.getElementType() == S64) {
  return buildReadAnyLaneSequenceOfS64(B, SgprDst, VgprSrc, RBI);
}
Can you simplify this into one isLegalType predicate? This is just expanding out the 32-bit LCM type?
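One possible shape for such a predicate, offered only as a sketch of the suggestion; the name and the exact set of accepted types are assumptions rather than code from the patch:

// Accepts exactly the types the dispatch above handles.
static bool isReadAnyLaneSupportedType(LLT Ty) {
  if (Ty.isVector()) {
    LLT EltTy = Ty.getElementType();
    return EltTy == LLT::scalar(16) || EltTy == LLT::scalar(32) ||
           EltTy == LLT::scalar(64);
  }
  if (Ty.isPointer())
    return Ty.getSizeInBits() == 32 || Ty.getSizeInBits() == 64;
  return Ty == LLT::scalar(16) || Ty == LLT::scalar(32) ||
         Ty == LLT::scalar(64) || Ty == LLT::scalar(256);
}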
if (Slot != -1) {
  if (MUI.isUniform(Reg))
    return Uni[Slot];
  else
No else after return
// Note: if fast rules are enabled, RegBankLLTMapping must be added in each
// slot that could "match fast Predicate". If not, Invalid Mapping is
// returned which results in failure, does not search "Slow Rules".
if (FastTypes != No) {
"No"?
Renamed to NoFastRules
int SetOfRulesForOpcode::getFastPredicateSlot(
    UniformityLLTOpPredicateID Ty) const {
  switch (FastTypes) {
  case Standard:
Add braces
@@ -0,0 +1,258 @@
//===- AMDGPURBLegalizeRules -------------------------------------*- C++ -*-==//
Is it possible to share with the existing legalize rules?
Don't think sharing is a good option for this patch. RBLegalizeRules are much more flexible and, I would assume, faster because of the "FastPredicateSlot".
If we add more IDs that work with LLTs only, we could rewrite the Legalizer using RBLegalizeRules. The other way around is questionable; I did not consider upgrading LegalityPredicate and LegalizeMutation to work with register banks.
if (!MRI.getRegBankOrNull(Dst))
  MRI.setRegBank(Dst, RBI.getRegBank(SGPRRegBankID));
if (!MRI.getRegBankOrNull(Src))
  MRI.setRegBank(Src, RBI.getRegBank(VGPRRegBankID));
Probably should add a constrainRegBank method to MRI, similar to constrainRegClass, for this pattern.
How should it work with regard to the possibility of inserting an illegal sgpr-to-vgpr copy, and can it fail like the register class version?
Or are we looking for something much simpler:
no reg bank - set reg bank
same reg bank - do nothing
different reg bank - insert copy
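As a rough sketch of the "much simpler" variant (the helper is hypothetical; whether it should live on MRI, and whether it may insert copies or fail, are exactly the open questions above):

// Hypothetical helper mirroring the three cases listed above.
// Returns the register to use in place of Reg.
static Register constrainRegBank(MachineRegisterInfo &MRI, MachineIRBuilder &B,
                                 Register Reg, const RegisterBank &RB) {
  const RegisterBank *Current = MRI.getRegBankOrNull(Reg);
  if (!Current) { // no reg bank - set reg bank
    MRI.setRegBank(Reg, RB);
    return Reg;
  }
  if (Current == &RB) // same reg bank - do nothing
    return Reg;
  // different reg bank - insert copy
  Register Copy = MRI.createGenericVirtualRegister(MRI.getType(Reg));
  MRI.setRegBank(Copy, RB);
  B.buildCopy(Copy, Reg);
  return Copy;
}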
ping
  return LLT::scalar(32);
}

static Register buildReadAnyLane(MachineIRBuilder &B, Register VgprSrc,
Move the function body to avoid forward declaring
There is a circular dependency between buildReadAnyLane and unmergeReadAnyLane.
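For readers outside the thread: with mutually recursive helpers, one of the two must be declared before its body no matter how the bodies are ordered. A standalone toy example (generic names, not the patch's functions):

#include <cstdio>

static void unmergeThenRead(int Depth); // forward declaration is unavoidable:
                                        // readLane calls it before its body.

static void readLane(int Depth) {
  if (Depth == 0)
    return;
  std::printf("readLane %d\n", Depth);
  unmergeThenRead(Depth - 1); // calls the second helper
}

static void unmergeThenRead(int Depth) {
  std::printf("unmerge %d\n", Depth);
  readLane(Depth); // calls back into the first helper
}

int main() { unmergeThenRead(2); }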
  return RC && TRI.isSGPRClass(RC) && MRI.getType(Reg) == LLT::scalar(1);
}

void cleanUpAfterCombine(MachineInstr &MI, MachineInstr *Optional0) {
static
This is a member function of AMDGPURegBankLegalizeCombiner since it needs access to MRI.
  Optional0->eraseFromParent();
}

std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) {
Why this instead of using mi_match?
This is much more convenient, shorter, and easier to read.
With mi_match you need to declare MI and Reg to pass as arguments, and then match the same thing twice using something like m_all_of(m_MInstr(...), m_Reg(...)).
This is a simple opcode check. Also, written like this it allows a structured binding of MI and Reg.
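For illustration, a plausible body for the signature quoted above and the structured-binding call site the reply refers to; this is a reconstruction based on the discussion, not the exact patch code, and it assumes MRI is a member of the combiner class:

// Return the defining instruction and its source register when Src is
// defined by Opcode; otherwise return a null instruction.
std::pair<MachineInstr *, Register> tryMatch(Register Src, unsigned Opcode) {
  MachineInstr *MatchMI = MRI.getVRegDef(Src);
  if (!MatchMI || MatchMI->getOpcode() != Opcode)
    return {nullptr, Register()};
  return {MatchMI, MatchMI->getOperand(1).getReg()};
}

// Call site with structured bindings:
auto [Trunc, TruncSrcReg] = tryMatch(Src, AMDGPU::G_TRUNC);
if (Trunc) {
  // ... combine using both the matched instruction and its source register.
}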
LGTM aside from a small nit
#include "AMDGPURegisterBankInfo.h" | ||
#include "MCTargetDesc/AMDGPUMCTargetDesc.h" | ||
#include "llvm/ADT/DenseSet.h" | ||
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" | ||
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" |
We should avoid unnecessary #includes in headers (for compile-time reasons). I doubt that all of these are really needed.
Updated. Are we ready to merge the whole stack? This commit is still missing approval.
@@ -217,6 +217,74 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  return true;
}

bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
I don't think it's trivial to avoid this a priori. I agree that a separate cleanup optimization could do it; in any case, I'd say it's best left to a separate focused change.
Lower G_ instructions that can't be inst-selected with the register bank assignment from AMDGPURegBankSelect, based on uniformity analysis.
- Lower instruction to perform it on the assigned register bank
- Put uniform value in vgpr because a SALU instruction is not available
- Execute divergent instruction in SALU - "waterfall loop"

Given LLTs on all operands after the legalizer, some register bank assignments require lowering while others do not. Note: cases where all register bank assignments would require lowering are lowered in the legalizer.

AMDGPURegBankLegalize goals:
- Define Rules: when and how to perform lowering
- The goal of defining Rules is to provide a high-level, table-like overview of how to lower generic instructions based on available target features and uniformity info (uniform vs divergent)
- Fast search of Rules; speed depends on how complicated Rule.Predicate is
- For some opcodes there would be too many Rules that are essentially all the same, just for different combinations of types and banks; write a custom function that handles all cases
- Rules are made from enum IDs that correspond to each operand. Names of IDs are meant to give a brief description of what lowering does for each operand or the whole instruction
- AMDGPURegBankLegalizeHelper implements the lowering algorithms

Since this is the first patch that actually enables -new-reg-bank-select, here is the summary of regression tests that were added earlier:
- if an instruction is uniform, always select a SALU instruction if available
- eliminate back-to-back vgpr-to-sgpr-to-vgpr copies of uniform values
- fast rules: small differences for standard and vector instructions
- enabling a Rule based on a target feature - salu_float
- how to specify a lowering algorithm - vgpr S64 AND to S32
- on G_TRUNC in reg, it is up to the user to deal with truncated bits; G_TRUNC in reg is treated as a no-op
- dealing with truncated high bits - ABS S16 to S32
- sgpr S1 phi lowering
- new opcodes for vcc-to-scc and scc-to-vcc copies
- lowering for vgpr-S1-to-vcc copy (formally this is a vgpr-to-vcc G_TRUNC)
- S1 zext and sext lowering to select
- uniform and divergent S1 AND (OR and XOR) lowering - inst-selected into a SALU instruction
- divergent phi with uniform inputs
- divergent instruction with a temporal divergent use, where the source instruction is defined as uniform (AMDGPURegBankSelect) - missing temporal divergence lowering
- uniform phi, because of an undef incoming, is assigned to vgpr; will be fixed in AMDGPURegBankSelect via another fix in machine uniformity analysis
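As a self-contained toy illustration of the "Rules" idea described above (a set of rules per opcode, selected by a uniformity/type predicate, producing one mapping ID per operand), every name below is invented for the example and is not a class from the patch:

#include <map>
#include <optional>
#include <vector>

enum OperandID { Sgpr32, Vgpr32, Sgpr64, Vgpr64 };

struct Rule {
  bool Uniform;                   // predicate: uniform vs divergent result
  unsigned SizeInBits;            // predicate: size of the result type
  std::vector<OperandID> Mapping; // one ID per operand: how to lower it
};

struct OpcodeRuleSet {
  std::vector<Rule> Rules;
  std::optional<Rule> find(bool Uniform, unsigned SizeInBits) const {
    for (const Rule &R : Rules)
      if (R.Uniform == Uniform && R.SizeInBits == SizeInBits)
        return R;
    return std::nullopt; // no matching rule -> legalization failure
  }
};

int main() {
  constexpr unsigned G_AND = 1; // stand-in for a real opcode value
  std::map<unsigned, OpcodeRuleSet> RulesByOpcode;
  RulesByOpcode[G_AND].Rules = {{true, 32, {Sgpr32, Sgpr32, Sgpr32}},
                                {false, 32, {Vgpr32, Vgpr32, Vgpr32}}};
  auto R = RulesByOpcode[G_AND].find(/*Uniform=*/true, /*SizeInBits=*/32);
  return R ? 0 : 1;
}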
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/186/builds/5944. Here is the relevant piece of the build log for reference.